Meta-Tunning 4

중첩 반복문 튜닝
밀집 행렬의 곱셈
const unsigned s=128;
matrix<float> A(s, s), B(s, s), C(s, s);
for(unsigned i=0; i<s; ++i){
for(unsigned j=0; j<s; j++){
A(i, j)=100.0*i+j;
B(i, j)=200.0*i+j;
}
}
mult(A, B, C);
행렬 곱셈을 위한 mult 템플릿 표현식
template <typename Matrix>
inline void mult(const Matrix& A, const Matrix& B, Matrix& C){
assert(A.num_rows()==B.num_rows());
typedef typename Matrix::value_type value_type;
unsigned s=A.num_rows();
for(unsigned i=0; i<s; ++i){
for(unsigned k=0; k<s; k++){
value_type tmp(0);
for(unsigned j=0; j<s; j++) tmp+=A(i, j*B(j, k));
C(i, k)=tmp;
}
}
}
mult을 수행하기 위해 세번의 중첩된 반복문을 사용한다.
성능 향상을 위해서 밖의 외부의 2개의 반복문을 메타 튜닝을 이용해서 풀어내고,
내부 반복문에서 블록 연산을 수행한다. 결과를 담는 C는 임시변수를 사용한다.(mult_tmp)
template <unsigned BSize, typename Value>
struct multi_tmp{
typedef multi_tmp<BSize-1, Value> sub_type;
multi_tmp(const Value& v):value(v), sub(v) {}
Value value;
sub_type sub;
Value sum() const { return value+sub.sum(); }
};
template <typename Value>
struct multi_tmp<0, Value>{
multi_tmp(const Value& v) {}
Value sum() const { return 0; }
};
template <unsigned Index0, unsigned Max0, unsigned Index1, unsigned Max1>
struct mult_block{
typedef mult_block<Index0, Max0, Index1+1, Max1> next;
template <typename Tmp, typename Matrix>
void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){
tmp.value+=A(i+Index0, j)*B(j, k+Index1);
next()(tmp.sub, A, B, i, j, k);
}
template <typename Tmp, typename Matrix>
void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){
C(i+Index0, k+Index1)=tmp.value;
next().update(tmp.sub, C, i, k);
}
};
template <unsigned Index0, unsigned Max0, unsigned Max1>
struct mult_block<Index0, Max0, Max1, Max1>{
typedef mult_block<Index0+1, Max0, 0, Max1> next;
template <typename Tmp, typename Matrix>
void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){
tmp.value+=A(i+Index0, j)*B(j, k+Max1);
next()(tmp.sub, A, B, i, j, k);
}
template <typename Tmp, typename Matrix>
void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){
C(i+Index0, k+Max1)=tmp.value;
next().update(tmp.sub, C, i, k);
}
};
template <unsigned Max0, unsigned Max1>
struct mult_block<Max0, Max0, Max1, Max1>{
template <typename Tmp, typename Matrix>
void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){
tmp.value+=A(i, Max0, j)*B(j, k+Max1);
}
template <typename Tmp, typename Matrix>
void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){
C(i+Max0, k+Max1)=tmp.value;
}
};
template <unsigned Size0, unsigned Size1, typename Matrix>
inline void mult(const Matrix& A, const Matrix& B, Matrix& C){
using value_type=typename Matrix::value_type;
unsigned s=A.num_rows();
mult_block<0, Size0-1, 0, Size1-1> block;
for(unsigned int i=0; i<s; i+=Size0){
for(unsigned int k=0; k<s; k+=Size1){
mult_tmp<Size0*Size1, value_type> tmp(value_type(0));
for(unsigned j=0; j<s; j++) block(tmp, A, B, i ,j, k);
block.update(tmp, C, i, k);
}
}
}
단순화(loop2)
재사용할 가능성 증가(첫 번째 특수화를 생략해도 괜찮음)
template <unsigned Index0, unsigned Max0, unsigned Index1, unsigned Max1>
struct loop2{
static const unsigned next_index0=Index0, next_index1=Index1+1;
};
template <unsigned Index0, unsigned Max0, unsigned Max1>
struct loop2<Index0, Max0, Max1, Max1>{
static const unsigned next_index0=Index0+1, next_index1=0;
};
// loop2 ( )
template <unsigned Index0, unsigned Max0, unsigned Index1, unsigned Max1>
struct mult_block{
typedef loop2<Index0, Max0, Index1, Max1> l;
typedef mult_block<l::next_index0, Max0, l::next_index1, Max1> next;
template <typename Tmp, typename Matrix>
void operator()(Tmp& tmp, const Matrix& A, const Matrix& B, unsigned i, unsigned j, unsigned k){
tmp.value+=A(i, Index0, j)*B(j, k+Index1);
next()(tmp.sub, A, B, i, j, k);
}
template <typename Tmp, typename Matrix>
void update(const Tmp& tmp, Matrix& C, unsigned i, unsigned k){
C(i+Index0, k+Index1)=tmp.value;
next().update(tmp.sub, C, i, k);
}
};
//